{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "67c1e6b1",
   "metadata": {},
   "source": [
    "\n",
    "\n",
    "下面的例子将使用词向量标准工具包gensim加载预训练的GloVe词嵌入，并展示词嵌入如何表示词的相似度。\n",
    "<!-- https://nlp.stanford.edu/projects/glove/ -->"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "5c5a740a",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pprint\n",
    "\n",
    "from gensim.models import KeyedVectors\n",
    "\n",
    "# Download the GloVe vectors (glove.6B.zip) from the official GloVe site,\n",
    "# unzip the archive, and point the path below at the extracted file.\n",
    "# no_header=True is passed because the GloVe text file has no\n",
    "# word2vec-style header line (see the gensim documentation).\n",
    "model = KeyedVectors.load_word2vec_format(\n",
    "    '/your/path/here' + '/glove.6B.100d.txt',\n",
    "    binary=False,\n",
    "    no_header=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "01a2e4a5",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[('movie', 0.9055121541023254),\n",
      " ('films', 0.8914433717727661),\n",
      " ('directed', 0.8124364018440247),\n",
      " ('documentary', 0.8075793981552124),\n",
      " ('drama', 0.7929168939590454),\n",
      " ('movies', 0.7889865040779114),\n",
      " ('comedy', 0.7842751741409302),\n",
      " ('starring', 0.7573286294937134),\n",
      " ('cinema', 0.7419455647468567),\n",
      " ('hollywood', 0.7307389378547668)]\n",
      "[('vehicle', 0.8630837798118591),\n",
      " ('truck', 0.8597878813743591),\n",
      " ('cars', 0.837166965007782),\n",
      " ('driver', 0.8185911178588867),\n",
      " ('driving', 0.7812635898590088),\n",
      " ('motorcycle', 0.7553157210350037),\n",
      " ('vehicles', 0.7462256550788879),\n",
      " ('parked', 0.74594646692276),\n",
      " ('bus', 0.7372707724571228),\n",
      " ('taxi', 0.7155268788337708)]\n"
     ]
    }
   ],
   "source": [
    "# Use most_similar() to list the vocabulary words whose vectors are\n",
    "# closest (most similar) to each query word.\n",
    "for query_word in ('film', 'car'):\n",
    "    pprint.pprint(model.most_similar(query_word))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "8b62f7ad",
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "japanese\n",
      "panda\n",
      "longest\n",
      "terrible\n",
      "queen\n"
     ]
    }
   ],
   "source": [
    "# Word-analogy demo built on the GloVe vectors.\n",
    "def analogy(x1, x2, y1):\n",
    "    \"\"\"Return the word that relates to y1 the way x2 relates to x1.\n",
    "\n",
    "    Queries the notebook-level `model` with y1 and x2 as positive words\n",
    "    and x1 as the negative word, then returns the word of the\n",
    "    top-ranked result.\n",
    "    \"\"\"\n",
    "    ranked = model.most_similar(negative=[x1], positive=[y1, x2])\n",
    "    top_hit = ranked[0]\n",
    "    return top_hit[0]\n",
    "\n",
    "# Each triple is (x1, x2, y1): x1 is to x2 as y1 is to the printed word.\n",
    "analogy_queries = [\n",
    "    ('china', 'chinese', 'japan'),\n",
    "    ('australia', 'koala', 'china'),\n",
    "    ('tall', 'tallest', 'long'),\n",
    "    ('good', 'fantastic', 'bad'),\n",
    "    ('man', 'woman', 'king'),\n",
    "]\n",
    "for query in analogy_queries:\n",
    "    print(analogy(*query))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0c308cee",
   "metadata": {},
   "source": [
    "下面将展示word2vec的代码，包括文本预处理、skipgram算法的实现，以及使用PyTorch进行优化。这里使用《小王子》这本书作为训练语料。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "590fc408",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 安装NLTK，使用如下代码下载punkt组件\n",
    "#import nltk\n",
    "#nltk.download('punkt')\n",
    "\n",
    "from nltk.tokenize import sent_tokenize, word_tokenize\n",
    "from collections import defaultdict\n",
    "\n",
    "# Dataset wrapper that owns corpus reading and text preprocessing.\n",
    "class TheLittlePrinceDataset:\n",
    "    \"\"\"Corpus loader and vocabulary builder for the skip-gram example.\n",
    "\n",
    "    Attributes are created by the methods rather than up front:\n",
    "    __init__ sets sentences/tokens (tokenize=True) or text (tokenize=False),\n",
    "    build_vocab sets frequency/token2id/id2token, and\n",
    "    convert_tokens_to_ids sets token_ids.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, tokenize=True, path='the little prince.txt'):\n",
    "        \"\"\"Read the corpus file and optionally tokenize it.\n",
    "\n",
    "        Args:\n",
    "            tokenize: if True, lower-case the text, split it into sentences\n",
    "                with NLTK's sent_tokenize and split each sentence into\n",
    "                tokens with word_tokenize; if False, only keep the raw text.\n",
    "            path: location of the UTF-8 encoded corpus file.\n",
    "        \"\"\"\n",
    "        # The context manager closes the file handle as soon as it is read.\n",
    "        with open(path, 'r', encoding='utf-8') as corpus_file:\n",
    "            text = corpus_file.read()\n",
    "        if tokenize:\n",
    "            self.sentences = sent_tokenize(text.lower())\n",
    "            self.tokens = [word_tokenize(sent) for sent in self.sentences]\n",
    "        else:\n",
    "            # NOTE: self.tokens is not set on this path, so build_vocab and\n",
    "            # convert_tokens_to_ids cannot be used on such an instance.\n",
    "            self.text = text\n",
    "\n",
    "    def build_vocab(self, min_freq=1):\n",
    "        \"\"\"Count token frequencies and build the token <-> id mappings.\n",
    "\n",
    "        Args:\n",
    "            min_freq: exclusive threshold; only tokens occurring strictly\n",
    "                more than min_freq times enter the vocabulary.\n",
    "        \"\"\"\n",
    "        frequency = defaultdict(int)\n",
    "        for sentence in self.tokens:\n",
    "            for token in sentence:\n",
    "                frequency[token] += 1\n",
    "        self.frequency = frequency\n",
    "\n",
    "        # <unk> stands in for out-of-vocabulary words; <pad> is reserved for\n",
    "        # padding variable-length inputs into batches.\n",
    "        self.token2id = {'<unk>': 1, '<pad>': 0}\n",
    "        self.id2token = {1: '<unk>', 0: '<pad>'}\n",
    "        for token, freq in sorted(frequency.items(), key=lambda x: -x[1]):\n",
    "            # Items are sorted by descending frequency, so the first\n",
    "            # low-frequency token ends the scan.\n",
    "            if freq > min_freq:\n",
    "                self.token2id[token] = len(self.token2id)\n",
    "                self.id2token[len(self.id2token)] = token\n",
    "            else:\n",
    "                break\n",
    "\n",
    "    def get_word_distribution(self):\n",
    "        \"\"\"Return the normalized word-frequency distribution over token ids.\n",
    "\n",
    "        Must be called after build_vocab. Uses only this instance's\n",
    "        vocabulary and counts (no notebook-level globals).\n",
    "\n",
    "        Returns:\n",
    "            1-D numpy array of length len(self.token2id) that sums to 1.\n",
    "            Counts of tokens outside the vocabulary are added to <unk>.\n",
    "        \"\"\"\n",
    "        unk_id = self.token2id['<unk>']\n",
    "        distribution = np.zeros(len(self.token2id))\n",
    "        for token, freq in self.frequency.items():\n",
    "            # Tokens missing from the vocabulary are counted as <unk>.\n",
    "            distribution[self.token2id.get(token, unk_id)] += freq\n",
    "        distribution /= distribution.sum()\n",
    "        return distribution\n",
    "\n",
    "    def convert_tokens_to_ids(self, drop_single_word=True):\n",
    "        \"\"\"Encode every tokenized sentence as a list of token ids.\n",
    "\n",
    "        Args:\n",
    "            drop_single_word: if True, skip sentences made of one token.\n",
    "\n",
    "        Returns:\n",
    "            The list of id lists, also stored in self.token_ids.\n",
    "        \"\"\"\n",
    "        unk_id = self.token2id['<unk>']\n",
    "        self.token_ids = []\n",
    "        for sentence in self.tokens:\n",
    "            token_ids = [self.token2id.get(token, unk_id) for token in sentence]\n",
    "            # A one-token sentence yields no (center, context) pair, so it\n",
    "            # cannot contribute to the loss.\n",
    "            if len(token_ids) == 1 and drop_single_word:\n",
    "                continue\n",
    "            self.token_ids.append(token_ids)\n",
    "\n",
    "        return self.token_ids\n",
    "\n",
    "# Build the dataset: tokenize the corpus, create the vocabulary, and\n",
    "# encode every sentence as a list of token ids.\n",
    "dataset = TheLittlePrinceDataset(tokenize=True)\n",
    "dataset.build_vocab(min_freq=1)\n",
    "sentences = dataset.convert_tokens_to_ids(drop_single_word=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "efc882de",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(74374, 2) [[  4  17]\n",
      " [  4  20]\n",
      " [ 17   4]\n",
      " ...\n",
      " [131   2]\n",
      " [  2  86]\n",
      " [  2 131]]\n"
     ]
    }
   ],
   "source": [
    "# numpy must be installed beforehand.\n",
    "import numpy as np\n",
    "\n",
    "# Enumerate every (center word, context word) pair in the corpus.\n",
    "window_size = 2\n",
    "data = []\n",
    "\n",
    "for sentence in sentences:\n",
    "    for i, center_word in enumerate(sentence):\n",
    "        # Clamp the context window to the sentence boundaries.\n",
    "        first = max(i - window_size, 0)\n",
    "        last = min(i + window_size, len(sentence) - 1)\n",
    "        for j in range(first, last + 1):\n",
    "            # The center position itself is not a context word.\n",
    "            if j != i:\n",
    "                data.append([center_word, sentence[j]])\n",
    "\n",
    "data = np.array(data)\n",
    "print(data.shape, data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "30903b3d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 需要提前安装PyTorch\n",
    "import torch\n",
    "from torch import nn\n",
    "import torch.nn.functional as F\n",
    "\n",
    "# Skip-gram model trained with a contrastive (negative-sampling) loss.\n",
    "class SkipGramNCE(nn.Module):\n",
    "    \"\"\"Skip-gram embeddings learned by contrasting each true\n",
    "    (center, context) pair with randomly sampled negative words.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, vocab_size, embed_size, distribution,\n",
    "                 neg_samples=20):\n",
    "        \"\"\"Create the two embedding tables and the sampling distribution.\n",
    "\n",
    "        Args:\n",
    "            vocab_size: number of rows in each embedding table.\n",
    "            embed_size: dimensionality of each embedding vector.\n",
    "            distribution: per-token-id weights used to draw negative\n",
    "                samples; the caller's array is not modified.\n",
    "            neg_samples: number of negative words drawn per pair.\n",
    "        \"\"\"\n",
    "        super().__init__()\n",
    "        print(f'vocab_size = {vocab_size}, embed_size = {embed_size}, '\n",
    "              f'neg_samples = {neg_samples}')\n",
    "        self.input_embeddings = nn.Embedding(vocab_size, embed_size)\n",
    "        self.output_embeddings = nn.Embedding(vocab_size, embed_size)\n",
    "        # Flatten the distribution with a 0.75 exponent and renormalize.\n",
    "        # np.power returns a new array, so the in-place division below\n",
    "        # does not touch the argument.\n",
    "        distribution = np.power(distribution, 0.75)\n",
    "        distribution /= distribution.sum()\n",
    "        # Non-persistent buffer: it follows model.to(device) like the\n",
    "        # embeddings do, but stays out of state_dict so checkpoints keep\n",
    "        # the same keys as before.\n",
    "        self.register_buffer('distribution', torch.tensor(distribution),\n",
    "                             persistent=False)\n",
    "        self.neg_samples = neg_samples\n",
    "\n",
    "    def forward(self, input_ids, labels):\n",
    "        \"\"\"Return the mean negative-sampling loss of a batch.\n",
    "\n",
    "        Args:\n",
    "            input_ids: 1-D tensor of center-word ids, shape (batch,).\n",
    "            labels: 1-D tensor of context-word ids, shape (batch,).\n",
    "\n",
    "        Returns:\n",
    "            Scalar tensor holding the loss averaged over the batch.\n",
    "        \"\"\"\n",
    "        i_embed = self.input_embeddings(input_ids)\n",
    "        o_embed = self.output_embeddings(labels)\n",
    "        batch_size = i_embed.size(0)\n",
    "        # Draw neg_samples negative word ids for every pair in the batch.\n",
    "        n_words = torch.multinomial(\n",
    "            self.distribution, batch_size * self.neg_samples,\n",
    "            replacement=True).view(batch_size, -1)\n",
    "        n_embed = self.output_embeddings(n_words)\n",
    "        pos_term = F.logsigmoid(torch.sum(i_embed * o_embed, dim=1))\n",
    "        # (batch, neg, dim) x (batch, dim, 1) -> (batch, neg, 1). Squeeze only\n",
    "        # the trailing dimension: a bare squeeze() would also drop the batch\n",
    "        # or sample dimension when its size is 1 and break sum(dim=1).\n",
    "        neg_scores = torch.bmm(n_embed, i_embed.unsqueeze(2)).squeeze(2)\n",
    "        neg_term = torch.sum(F.logsigmoid(-neg_scores), dim=1)\n",
    "        loss = -torch.mean(pos_term + neg_term)\n",
    "        return loss"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "1d9da6c8",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[0.00000000e+00 4.95799942e-02 5.48904123e-02 ... 9.65530559e-05\n",
      " 9.65530559e-05 9.65530559e-05]\n",
      "vocab_size = 1071, embed_size = 128, neg_samples = 20\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "epoch-99, loss=2.8468: 100%|█| 100/100 [05:03<00:00,  3.04s/\n"
     ]
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAjMAAAGwCAYAAABcnuQpAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8pXeV/AAAACXBIWXMAAA9hAAAPYQGoP6dpAABO/0lEQVR4nO3deVyU1f4H8M8zM8ywDSCgIAqKSu64ay6pZWpqZtmulf26Nyv3lmv7zW43tc2sLLtttprWzcxbaaIpau6IirgrCiqIsq8zzMz5/THMA8MmzgzzMPJ5v168kpln4PiE8OF7vuccSQghQEREROShVEoPgIiIiMgZDDNERETk0RhmiIiIyKMxzBAREZFHY5ghIiIij8YwQ0RERB6NYYaIiIg8mkbpATQ0i8WCCxcuQK/XQ5IkpYdDRERE9SCEQEFBASIiIqBS1V17uebDzIULFxAZGan0MIiIiMgBaWlpaN26dZ3XXPNhRq/XA7DejICAAIVHQ0RERPWRn5+PyMhI+ed4Xa75MGObWgoICGCYISIi8jD1aRFhAzARERF5NIYZIiIi8mgMM0REROTRGGaIiIjIozHMEBERkUdjmCEiIiKPxjBDREREHo1hhoiIiDwawwwRERF5NIYZIiIi8mgMM0REROTRGGaIiIjIo13zB002lEKDCbnFRvh4qRHir1N6OERERE0WKzMO+vKvFAx5YxPe+uOY0kMhIiJq0hhmHOSltt46o9mi8EiIiIiaNoYZB2k11ltXZhYKj4SIiKhpY5hxkK0yU2ZiZYaIiEhJDDMO0nKaiYiIqFFgmHGQl0YCAJQxzBARESmKYcZBcgMwp5mIiIgUxTDjINs0EyszREREymKYcZCXhj0zREREjQHDjIPkyoyJS7OJiIiUxDDjIC9OMxERETUKDDMO0nKaiYiIqFFgmHGQl5pLs4mIiBoDhhkHabk0m4iIqFFgmHFQRc8MG4CJiIiUxDDjIPbMEBERNQ4MMw6qvJpJCFZniIiIlMIw4yBbz4wQgMnCMENERKQUhhkH2Q6aBLiiiYiISEkMMw6yVWYA7gJMRESkJIYZB6lVEqTy4gybgImIiJTDMOMgSZLkJmCGGSIiIuUwzDih4rBJhhkiIiKlMMw4gUcaEBERKY9hxgncOI+IiEh5DDNO8OL5TERERIpjmHGCluczERERKY5hxgmVjzQgIiIiZTDMOIE9M0RERMpjmHGCvJqJPTNERESKYZhxAjfNIyIiUh7DjBNs00zsmSEiIlIOw4wTKnYA5momIiIipTDMOIHTTERERMprNGFmwYIFkCQJc+bMkR8TQmDevHmIiIiAj48Phg8fjuTkZOUGWYWXhpvmERERKa1RhJk9e/bgk08+QWxsrN3jb775JhYtWoQlS5Zgz549CA8Px8iRI1FQUKDQSO3xbCYiIiLlKR5mCgsLMXnyZHz66ado1qyZ/LgQAosXL8aLL76IiRMnolu3bvjqq69QXFyM5cuX1/rxDAYD8vPz7d4aipab5hERESlO8TAzffp0jBs3DjfffLPd4ykpKcjIyMCoUaPkx3Q6HYYNG4bt27fX+vEWLFiAwMBA+S0yMrLBxl6xaR4bgImIiJSiaJhZsWIF9u3bhwULFlR7LiMjAwAQFhZm93hYWJj8XE2ef/555OXlyW9paWmuHXQlPGiSiIhIeRqlPnFaWhpmz56N9evXw9vbu9brJEmye18IUe2xynQ6HXQ6ncvGWReezURERKQ8xSozCQkJyMzMRJ8+faDRaKDRaBAfH4/3338fGo1GrshUrcJkZmZWq9YoRcsGYCIiIsUpFmZGjBiBpKQk7N+/X37r27cvJk+ejP3796Ndu3YIDw9HXFyc/Bqj0Yj4+HgMGjRIqWHb4Q7AREREylNsmkmv16Nbt252j/n5+SEkJER+fM6cOZg/fz5iYmIQExOD+fPnw9fXF5MmTVJiyNXYppkM7JkhIiJSjGJhpj7mzp2L
kpISTJs2DTk5ORgwYADWr18PvV6v9NAAVO6Z4WomIiIipTSqMLN582a79yVJwrx58zBv3jxFxnMlth2Ay1iZISIiUozi+8x4Mh1XMxERESmOYcYJXhrraiYeNElERKQchhkncNM8IiIi5THMOIGb5hERESmPYcYJFfvMcDUTERGRUhhmnMBTs4mIiJTHMOME9swQEREpj2HGCV5qrmYiIiJSGsOME9gATEREpDyGGSfo2ABMRESkOIYZJ7BnhoiISHkMM06wnc3EnhkiIiLlMMw4wdYAXGa2QAhONRERESmBYcYJOrUaACAEYLYwzBARESmBYcYJtoMmAU41ERERKYVhxgm2BmAAKDOxMkNERKQEhhknaFSszBARESmNYcYJkiRVOmySYYaIiEgJDDNO4mGTREREymKYcZJ8PhM3ziMiIlIEw4yT5F2AWZkhIiJSBMOMkyoOm+RqJiIiIiUwzDhJxwZgIiIiRTHMOImHTRIRESmLYcZJtl2A2TNDRESkDIYZJ8k9M6zMEBERKYJhxklaNgATEREpimHGSbYdgI1ms8IjISIiapoYZpxUMc3EygwREZESGGacJO8AzAZgIiIiRTDMOEmrUQPgPjNERERKYZhxEs9mIiIiUhbDjJN4ajYREZGyGGacVHHQJBuAiYiIlMAw4yQvVmaIiIgUxTDjJNs+M9wBmIiISBkMM07Scmk2ERGRohhmnMRpJiIiImUxzDjJy3acAXcAJiIiUgTDjJO4NJuIiEhZDDNOqqjMMMwQEREpgWHGSbYGYFZmiIiIlMEw46SKTfMYZoiIiJTAMOMkeZ8ZhhkiIiJFMMw4Sa7MsGeGiIhIEQwzTqpYzcSl2UREREpgmHESN80jIiJSFsOMk2w9M2wAJiIiUgbDjJO8bGczsWeGiIhIEQwzTuI0ExERkbIYZpxUsTSbDcBERERKYJhxklyZ4TQTERGRIhhmnGSrzBg4zURERKQIhhkneVU6m0kITjURERG5G8OMk2yb5gkBmC0MM0RERO7GMOMkW88MwCZgIiIiJTDMOMnWMwNw4zwiIiIlMMw4SaOS5D9z4zwiIiL3Y5hxkiRJlQ6bZJghIiJyN4YZF6i8oomIiIjci2HGBSp2AWaYISIicjeGGRewrWgysGeGiIjI7RhmXKDisEkuzSYiInI3hhkX4DQTERGRchhmXEBuAOY0ExERkdsxzLgAD5skIiJSDsOMC8g9M6zMEBERuR3DjAuwAZiIiEg5ioaZpUuXIjY2FgEBAQgICMDAgQOxdu1a+XkhBObNm4eIiAj4+Phg+PDhSE5OVnDENeMOwERERMpRNMy0bt0aCxcuxN69e7F3717cdNNNmDBhghxY3nzzTSxatAhLlizBnj17EB4ejpEjR6KgoEDJYVdj65nh2UxERETup2iYGT9+PMaOHYvrrrsO1113HV5//XX4+/tj586dEEJg8eLFePHFFzFx4kR069YNX331FYqLi7F8+XIlh12NbTUTT80mIiJyv0bTM2M2m7FixQoUFRVh4MCBSElJQUZGBkaNGiVfo9PpMGzYMGzfvr3Wj2MwGJCfn2/31tC8OM1ERESkGMXDTFJSEvz9/aHT6fD444/j559/RpcuXZCRkQEACAsLs7s+LCxMfq4mCxYsQGBgoPwWGRnZoOMH2DNDRESkJMXDTMeOHbF//37s3LkTTzzxBKZMmYLDhw/Lz0uSZHe9EKLaY5U9//zzyMvLk9/S0tIabOw2FTsAczUTERGRu2mUHoBWq0WHDh0AAH379sWePXvw3nvv4dlnnwUAZGRkoGXLlvL1mZmZ1ao1lel0Ouh0uoYddBU8aJKIiEg5ildmqhJCwGAwIDo6GuHh4YiLi5OfMxqNiI+Px6BBgxQcYXXsmSEiIlKOopWZF154AWPGjEFkZCQKCgqwYsUKbN68GevWrYMkSZgzZw7mz5+PmJgYxMTEYP78+fD19cWkSZOUHHY1XhqezURERKQURcPMxYsX8eCD
DyI9PR2BgYGIjY3FunXrMHLkSADA3LlzUVJSgmnTpiEnJwcDBgzA+vXrodfrlRx2NTpWZoiIiBSjaJj5/PPP63xekiTMmzcP8+bNc8+AHGSbZuI+M0RERO7X6HpmPJGXvAMwVzMRERG5G8OMC7ABmIiISDkMMy6gLT/OgGGGiIjI/RhmXIAHTRIRESmHYcYF2ABMRESkHIYZF2DPDBERkXIYZlygIsxwNRMREZG7Mcy4gI49M0RERIphmHEBTjMREREph2HGBbzKl2azAZiIiMj9GGZcwLYDMCszRERE7scw4wJa2zQTjzMgIiJyO4YZF5A3zWNlhoiIyO0YZlxAbgDmaiYiIiK3Y5hxATYAExERKYdhxgW0XJpNRESkGIYZF7D1zFgEYGKgISIiciuGGRew9cwAPNKAiIjI3RhmXKBymGHfDBERkXsxzLiArQEYYN8MERGRuzHMuIAkSXITMA+bJCIici+GGRexVWdYmSEiInIvhhkX4flMREREymCYcREveZqJq5mIiIjciWHGReSeGVZmiIiI3IphxkW0nGYiIiJSBMOMi8gNwFzNRERE5FYMMy7ixWkmIiIiRTDMuEjFNBMbgImIiNyJYcZFvLhpHhERkSIYZlzEtpqJDcBERETuxTDjIrYGYPbMEBERuRfDjIt4sTJDRESkCIYZF7E1ALNnhoiIyL0YZlykvj0zWYUGJJzNdseQiIiImgSGGRepmGaqe2n2kz8cwJ1Ld+DQ+Tx3DIuIiOia51CY+eqrr/Dbb7/J78+dOxdBQUEYNGgQzp4967LBeRIvTXkD8BWmmU5lFgIA0rKLG3xMRERETYFDYWb+/Pnw8fEBAOzYsQNLlizBm2++idDQUDz55JMuHaCn0KrVAOpezSSEwOVCAwCgwGByy7iIiIiudRpHXpSWloYOHToAAFavXo277roLU6dOxeDBgzF8+HBXjs9j2CozdZ3NVGQ0w1D+fGEpwwwREZErOFSZ8ff3R1ZWFgBg/fr1uPnmmwEA3t7eKCkpcd3oPEh9GoCzyqsyAFDIygwREZFLOFSZGTlyJP7+97+jV69eOH78OMaNGwcASE5ORtu2bV05Po9RcdBk7Q3AlwuN8p8ZZoiIiFzDocrMhx9+iIEDB+LSpUv46aefEBISAgBISEjA/fff79IBeor67DOTXVQRZgo4zUREROQSDlVmgoKCsGTJkmqPv/rqq04PyFPVZwdgTjMRERG5nkOVmXXr1mHbtm3y+x9++CF69uyJSZMmIScnx2WD8yTa8rOZ6gwzlSozhaVlDT4mIiKipsChMPOPf/wD+fn5AICkpCQ8/fTTGDt2LE6fPo2nnnrKpQP0FPWpzFxmZYaIiMjlHJpmSklJQZcuXQAAP/30E2699VbMnz8f+/btw9ixY106QE9RnwbgrEL2zBAREbmaQ5UZrVaL4mLrDrYbNmzAqFGjAADBwcFyxaapqWgANtd6TVYRKzNERESu5lBlZsiQIXjqqacwePBg7N69GytXrgQAHD9+HK1bt3bpAD1Ffc5myuLSbCIiIpdzqDKzZMkSaDQa/Pe//8XSpUvRqlUrAMDatWtxyy23uHSAnkKrudoGYBOEqPtQSiIiIroyhyozUVFR+PXXX6s9/u677zo9IE8l98zUss+MxSLs9pkxWQRKyyzw0ardMj4iIqJrlUNhBgDMZjNWr16NI0eOQJIkdO7cGRMmTIBa3TR/OGvlBuCaw0xeSRnMFmslRpIAIYACQxnDDBERkZMcCjMnT57E2LFjcf78eXTs2BFCCBw/fhyRkZH47bff0L59e1ePs9Hz0tS9NNvW/Bvo4wWLRaDAYEJhqQkt9G4bIhER0TXJoZ6ZWbNmoX379khLS8O+ffuQmJiI1NRUREdHY9asWa4eo0eQD5o01dwHYzuXKcRPC39va4ZkEzAREZHzHKrMxMfHY+fOnQgODpYfCwkJwcKFCzF48GCXDc6TXGnTPFu/TIi/FnklZUjPszYBExER
kXMcCjM6nQ4FBQXVHi8sLIRWq3V6UJ7oSgdN2s5lCvHTyb0zBazMEBEROc2haaZbb70VU6dOxa5duyCEgBACO3fuxOOPP47bbrvN1WP0CF7lZzPV1gAsTzP5a+Hv7QWAlRkiIiJXcCjMvP/++2jfvj0GDhwIb29veHt7Y9CgQejQoQMWL17s4iF6Bu0VpplsDcAh/jrodeyZISIichWHppmCgoLwyy+/4OTJkzhy5AiEEOjSpQs6dOjg6vF5DFvPjEUAZouAWiXZPW/b/TfUX4uLeQwzRERErlLvMHOl07A3b94s/3nRokUOD8hT2ZZmA9a+mar7x2TJq5l08momHjZJRETkvHqHmcTExHpdJ0nSlS+6BtmmmQBr34wP7MPM5fJppmA/LfzlaaYy9w2QiIjoGlXvMLNp06aGHIfHszUAAzX3zdiWZof6a6G37TPDygwREZHTHGoApuokSZIDTdUwU2a2ILfYWoUJ8ddVqswwzBARETmLYcaFvGrZBTinvCqjkoAgHy/2zBAREbkQw4wL2TbOM5jMdo/b9pgJ9tNBpZJYmSEiInIhhhkXaqHXAQDScortHrftMRPqb90dWc+zmYiIiFyGYcaFOoUHAACOpNsf9ZBVafdfAPDXWXcA5jQTERGR8xhmXKhzS1uYybd7PKuoYpoJQMWp2QwzRERETmOYcaFOLfUAgKMZVSsztkMmbZUZa5gxmi3V+muIiIjo6igaZhYsWIB+/fpBr9ejRYsWuP3223Hs2DG7a4QQmDdvHiIiIuDj44Phw4cjOTlZoRHXrXP5NNPpS4UoLasIKZWPMgAqwgzA6gwREZGzFA0z8fHxmD59Onbu3Im4uDiYTCaMGjUKRUVF8jVvvvkmFi1ahCVLlmDPnj0IDw/HyJEjUVBQUMdHVkZYgA7NfL1gEcCJi4Xy45UPmQQAtUqCb/lxB2wCJiIico6iYWbdunV4+OGH0bVrV/To0QPLli1DamoqEhISAFirMosXL8aLL76IiRMnolu3bvjqq69QXFyM5cuXKzn0GkmSVNEEnFHRN3NZPpdJKz9mq86wCZiIiMg5japnJi8vDwAQHBwMAEhJSUFGRgZGjRolX6PT6TBs2DBs3769xo9hMBiQn59v9+ZONTUBV63MAFyeTURE5CqNJswIIfDUU09hyJAh6NatGwAgIyMDABAWFmZ3bVhYmPxcVQsWLEBgYKD8FhkZ2bADr0JuAq60PDu7Ss8MAPh7W5dns2eGiIjIOY0mzMyYMQMHDx7E999/X+25qidxCyFqPZ37+eefR15envyWlpbWIOOtja0J+GhGPoQQKDGaUWS0NgMHV5pm0nMXYCIiIpeo96nZDWnmzJlYs2YNtmzZgtatW8uPh4eHA7BWaFq2bCk/npmZWa1aY6PT6aDT6Wp8zh1iwvyhkoCc4jJczDfAZLEeOqnVqOxWMck9MwwzRERETlG0MiOEwIwZM7Bq1Sr8+eefiI6Otns+Ojoa4eHhiIuLkx8zGo2Ij4/HoEGD3D3cevH2UqNdc38A1iZgeVm2n9aumsSN84iIiFxD0crM9OnTsXz5cvzyyy/Q6/VyH0xgYCB8fHwgSRLmzJmD+fPnIyYmBjExMZg/fz58fX0xadIkJYdep07hepzMLMSR9Hx0Crf20FRu/gVQ6bDJMrePj4iI6FqiaJhZunQpAGD48OF2jy9btgwPP/wwAGDu3LkoKSnBtGnTkJOTgwEDBmD9+vXQ6/VuHm39dW4ZgF8PpuNoegFCy0NMSKXmX6DSaiZWZoiIiJyiaJgRQlzxGkmSMG/ePMybN6/hB+QineVjDfLRJcLaEBziV3Nlhj0zREREzmk0q5muJba9Zk5dKsKF3BIA9suyAfbMEBERuQrDTAMID/BGoI8XzBaBXaezAdgvywYq98wwzBARETmDYaYBWI81sE41Hbto3TyvagOwrWeGxxkQERE5h2GmgdimmmyqNgD768p3AGZlhoiIyCkMMw3E1gRsE1pb
AzArM0RERE5hmGkgttOzbWpdms19ZoiIiJzCMNNArgvTQ1Xp+KjaGoBLyywoM1vcOTQiIqJrCsNMA/HRqtE21A+A9VBJby+13fN+lc5pKmLfDBERkcMYZhqQrQk4uMoUE2A9eFKnsd5+9s0QERE5jmGmAXW2ncvkVz3MAJX7ZhhmiIiIHMUw04Bu7NQCWo0Kg9qH1vg8N84jIiJynqJnM13rukYEImneKOg06hqf13uX7zXDaSYiIiKHsTLTwGoLMgAPmyQiInIFhhkF8bBJIiIi5zHMKEiv48Z5REREzmKYURArM0RERM5jmFEQe2aIiIicxzCjIFZmiIiInMcwoyA9T84mIiJyGsOMgvy5AzAREZHTGGYU5K+zbprHnhkiIiLHMcwoSD7OoJRLs4mIiBzFMKMgHjRJRETkPIYZBVVUZhhmiIiIHMUwoyBbA3CR0QyzRSg8GiIiIs/EMKMgW2UGAIqMrM4QERE5gmFGQTqNCl5qCQCnmoiIiBzFMKMgSZIq+mbYBExEROQQhhmF2fpmuAswERGRYxhmFKYv3ziPlRkiIiLHMMwojIdNEhEROYdhRmF6uWeGuwATERE5gmFGYeyZISIicg7DjMK4momIiMg5DDMKY88MERGRcxhmFGbrmeE0ExERkWMYZhTGaSYiIiLnMMwozN/bus9MAcMMERGRQxhmFCZXZkq5NJuIiMgRDDMKCyhvAM4pZpghIiJyBMOMwmLC9ACAM1lF7JshIiJyAMOMwprrdYgI9IYQwKHzeUoPh4iIyOMwzDQCsa2DAAAHz+UqOg4iIiJPxDDTCMRGBgIADqSxMkNERHS1GGYagR7llZkDrMwQERFdNYaZRqBbK2tl5lxOCbIKDQqPhoiIyLMwzDQCgT5eaNfcDwBwkE3AREREV4VhppGwTTUdZN8MERHRVWGYaSRiW1unmriiiYiI6OowzDQSsXITcB6EEMoOhoiIyIMwzDQSXSMCoFFJuFxowIW8UqWHQ0RE5DEYZhoJby81ris/2uBgWq6ygyEiIvIgDDONSA/b5nnn2ARMRERUXwwzjUgPHmtARER01RhmGhFbE3DSuTxYLFduAjaaLA08IiIiosaPYaYRuS7MH95eKhQYTEjJKqrz2h/3pqHbK39g45GLbhodERFR48Qw04ho1Cp0jajffjN/nbwMo9mCXSnZbhgZERFR48Uw08jYNs+70gnalwuNAIDsImODj4mIiKgxY5hpZOp7gvbl8gMpGWaIiKipY5hpZGyVmcMX8lFmrr3B1xZmshhmiIioiWOYaWTahvghwFsDg8mCYxkFNV5jtgi5IpPDMENERE0cw0wjo1JJ6G6rzqTn13hNTrERtpXbnGYiIqKmjmGmEWod5AsAuFjLGU22KSYAKDSYYDCZ3TIuIiKixohhphEKC9ABAC4W1BJmCuyrMTlFZQ0+JiIiosaKYaYRah7gDQDIzDfU+HxWkaHO94mIiJoShplGKExvq8zUHFIuVXmcfTNERNSUMcw0Qi3kykxtPTP24YVhhoiImjKGmUbI1jNzqcBQ44GTlRuAAYYZIiJq2hhmGqFQfx0kCTBZBLKLqweVrPIw4+1l/d/HMENERE2ZomFmy5YtGD9+PCIiIiBJElavXm33vBAC8+bNQ0REBHx8fDB8+HAkJycrM1g38lKrEOKnBVBzE7BtmimmhR4AwwwRETVtioaZoqIi9OjRA0uWLKnx+TfffBOLFi3CkiVLsGfPHoSHh2PkyJEoKKh5Z9xrSXO9tW+mpuXZtmmmmDB/AAwzRETUtGmU/ORjxozBmDFjanxOCIHFixfjxRdfxMSJEwEAX331FcLCwrB8+XI89thjNb7OYDDAYKioZuTn17yLbmMXFqDDkXTgUpXKjBACWeWVmY5h1soMz2ciIqKmrNH2zKSkpCAjIwOjRo2SH9PpdBg2bBi2b99e6+sWLFiAwMBA+S0yMtIdw3W5Frbl2VVWNOWXmmAsP4DyuvIww/OZiIio
KWu0YSYjIwMAEBYWZvd4WFiY/FxNnn/+eeTl5clvaWlpDTrOhhJmW55dZU8Z2xSTXqdByyDrNZxmIiKipkzRaab6kCTJ7n0hRLXHKtPpdNDpdA09rAZXW2Xmcnm4CfHXItjX2iScU2yExSKgUtV+X4iIiK5VjbYyEx4eDgDVqjCZmZnVqjXXoha1VGZs/TGh/jo0K1/xZBFAXgnPZyIioqap0YaZ6OhohIeHIy4uTn7MaDQiPj4egwYNUnBk7mGrzFTdBdg2zRTqr4OXWoUAb2txjU3ARETUVCk6zVRYWIiTJ0/K76ekpGD//v0IDg5GVFQU5syZg/nz5yMmJgYxMTGYP38+fH19MWnSJAVH7R62nplLhQa7KSTbNFOo3lqVCfbTIr/UhJwaNtcjIiJqChQNM3v37sWNN94ov//UU08BAKZMmYIvv/wSc+fORUlJCaZNm4acnBwMGDAA69evh16vV2rIbhPqb63MlJkFcoqNCCl//1L5suwQP+v7wX5anMkqlpdrExERNTWKhpnhw4dDiOpnD9lIkoR58+Zh3rx57htUI6HVWHcBzioyIrPAIIcZ21EGoXpbmLH+lyuaiIioqWq0PTMENK9hRZOtZ6a5v22ayQsAOM1ERERNFsNMI1bTXjO2c5ls01C2ygynmYiIqKlimGnEalrRZKvM2KadbAdSZhdVP5CSiIioKWCYacSqVmaKjSYUG80AgNDyaSbbXjPZxdxnhoiImiaGmUYsLMC+Z8Y2laTTqOCvs/ZuszJDRERNHcNMI9Zcb1+ZuVRpwzzbkQ5yZYY9M0RE1EQxzDRitspMZr41xMgb5pVPMQGVKjNczURERE0Uw0wjVnE+UymEEHbnMtkEl4eZ0jILio0m9w+SiIhIYQwzjVhzu12AyypVZirCjK9WDa3G+r+Ry7OJiKgpYphpxLQalVx5ySworThkUl8xzSRJkjzVxI3ziIioKWKYaeRayLsAG+QN82znMtnYAg9PziYioqaIYaaRk/tm8itXZmoOM1zRRERETRHDTCMXZtsFuMBQEWYqrWYCKsIMp5mIiKgpYphp5FoEVBxpUPVcJhtOMxERUVPGMNPI2Y40OJdTgrwS65EF1cKMb3llph5h5oWfk3Dn0u1cxk1ERNcMhplGztYAfDSjAACgVkkI8vGyuybYv36VmYLSMny/OxUJZ3Ow83RWA4yWiIjI/RhmGjlbA/D53BIA1h1/VSrJ7pqK85nqDjNJ5/IghPXPe8/kuHikREREymCYaeRaVFm5FFJligkAmtVzmikxLVf+c8JZhhkiIro2MMw0cs2rhJmqK5kAIKSe00yJqbnynw+cy0WZ2eL8AImIiBTGMNPI6TRqNPOt6JFpXkNlJrh8E728krJaA4oQAvsrVWZKyyxIvpDv2sESEREpgGHGA9hWNAHVN8wDgEAfL0jlbTS5xWU1fozzuSW4XGiARiVhUPsQAMDeM9muHywREZGbMcx4gMpTTbZm38rUKknum6mtCdhWlencMgCDO4QCYN8MERFdGxhmPIBdZaaGaSag8sZ5hhqf31/eL9MzMgh92zQDAOw9mwNhW95ERETkoRhmPEDlFU01TTMBlTfOq3mayVaZ6RkZhB6RQfBSS7hUYMC5nBLXDpaIiMjNGGY8gH1lpvo0E1DpsMkaKjNlZguSzucBAHpGBcHbS42uEYEAgL1n2TdDRESejWHGA9hVZmqZZmpWx/lMxzIKYDBZEOCtQXSIHwBUTDVx8zwiIvJwDDMeoEWlykxwDQ3AQEVjcE0b59k2y+sRGSTvHtynPMywCZiIiDwdw4wHaBfqB51GhZgW/vBS1/y/rK6Ts23Nv70ig+TH+rS1hpljFwvkAyyJiIg8kUbpAdCVNfPTIu7JYfD3rv1/V3Ad5zPtT7NWX3pGBcmPtdB7IyrYF6nZxUhMzcHwji1cO2giIiI3YWXGQ0SF+NY6xQTUHmbySspw6lIRAKBH6yC752x9M/s41URERB6MYeYa
UVuYOXguFwAQFexb7ZBK21TTXg8MM+9tOIHxH2yr86TwgtIyJKZ63t+NiIiuDsPMNcIWZnKKjbBYKjbCq7xZXlV92wRbr0nLhcmDDp00WwQ+23oaSefz8FtSeq3XvfjzIdzx0Xb8drD2a4iIyPMxzFwjQvy18NWqUWYWePzbBBQZTAAqVjL1qtQvYxPTwh8B3hoUG804kl7gxtE651hGAQrK/35/nbhc4zVGkwUbjlwEAHyz84y7hkZERApgmLlG6DRqLLwzFlq1CusPX8RdH+/A+dwSu51/q1KpJPSWjzbwnM3z9lQ6IHP7qcswW6ofybD3bDaKjWYAwM7T2UjNKnbb+IiIyL0YZq4ht/WIwPdTr0eovxZH0vMx9r2tyC4yQqtWoUtEQI2vsTUBrz2UUWMoaIwqh5n8UpO8u3FlW47bV2z+m5DW4OMiIiJlMMxcY/q0aYbV0wejU7he3j+mc0QAdBp1jdeP7d4SOo0Ku1Oy8W7ccXcO1SFCCDnM2HZG/utk9ammLccvAQBu6mRdcv7fhHMeE9aIiOjqMMxcg1o388VPTwzCzZ3DAABDY0JrvbZdc3+8cWcsAGDJppP4vY6G2so2HcvE9O/2IT3PuYMqhRA4fCEfpWXmel1/LqcEF/MN0Kgk/P2GaADAtip9M5cKDDicng8A+NeErgjw1uBCXim2n6q5v4aIiDwbw8w1yk+nwScP9sHvs27ArBExdV57e69WeLQ8GDzz4wEczciv83qT2YKXfj6E35LS8exPSRDC8YrHsr/OYOz7WzHq3S2IL6+m1MXW29OtVSBGdgkHYD2SocRYEYa2nrB+nK4RAWjdzBe39YwAAPy495zD4yQiosaLYeYaplJJ6BIRUOsRCJU9e0snDOkQimKjGVO/TkBuce37t2w8monzudaKzJbjl7DmwAWHxldmtuCTLacBAKnZxZjyxW7MWL4Pmfmltb5mT/nBmP3aNkPbEF+0CvKB0WzB7kp9NLYppqHXNQcA3N0nEgCwLjkDecU8uoGI6FrDMEMAAI1ahQ/u74XIYB+kZhdj5veJtfaYfLPjLACgVZAPAOBf/ztc4wGXpWVmJKbm1Fq5WXcoAxn5pQj11+KRwdFQScCvB9Mx4p14fL87tcbX7Emxhpa+bYMhSRIGdwgBUNE3Y7EIbC2fdhoaYw0zsa0D0TFMD6PJgjUHHQteRETUeDHMkKyZnxb/eaAvfLzU2HriMr7debbaNSczC7Dt5GWoJODbvw/AdWH+yCoyYv7vR+yuS8suxu0f/oU7PtqOt/44VuPnW/ZXCgBg8oA2+Of4LlgzYwh6tA5EgcGE51clIaHKcvGcIiNOZBYCqFiFNaQ8sNgCzOH0fGQVGeGnVcsng0uShLv7tgYA/Hdv3auaCg0mbDtx+YpTbdlFRi73VogQAhsOX+T9JyIZwwzZ6RIRgBfGdQYAvL3+GC4VGOyet1VlRnQOQ3SoHxZMjIUkAT8mnMP28urIztNZmPDhXziaYd2I75Mtp3Ey035TvgNpudiXmgsvtYTJ10cBsPbBrJo2GBPKe1w+jj9t95qE8mMX2jf3k49mGNTeWpk5kp6Py4UGbCnvlxnYPgRaTcWX9+29WkGjknDgXB6OZVSMJTO/FOuTM/D6b4cxYck29Hh1PR74fBfGf7CtxlVSAHDqUiFGLorH0Lc2Ye5/D+ByoaHG65oSIUS9m7idtWJPGv7+9V6MfX8rdp3OcsvndIXMglLs8/DjNYQQ+HDTSYx5z7PufU2yi4z4ZscZZDWSf78lRjMe+GwXRi6Kx6zvE7F08ynEH7/E7y/1xDBD1UzqH4XurQJRUGrCwrVH5ccLDSb8tO88AOChgW0AWJeCPzDA+ucXfk7CV9vP4IHPdiG7yIjurQIxpEMoTBaBl1cn2003fbn9DADg1tgItNB7y4+rVRJm3mRtWI47fBEnyysxALCnvFLTr22w/Fiovw5dWlr30Nl+Kqtav0zl624sX6b98i+H8Lcv96D/
6xvQf/5GTP0mAZ9uTcGBc3kwWwT03hqUmQUe+yYBh6rsYZOeV4IHP9uFrPJptR/2nsONb2/GF9tSPOpICFcwmS3YeToLr/16GEPf2oTO/1yHP5IzGvRzFhlMWFS+hUChwYSHvtiNTccyG/RzOutsVhGeX5WEIQs3YeJH2/FV+de+pyk2mjBjeSLe+uMYjqTn47FvE3A2q6jery8zW+wa9ZVUWmbGlC924+VfkvHg57tRbDQpPSS8t/EEtp28jBOZhVhz4ALeWHcUU77YjX6vb8A/fznktl8WPBXDDFWjVkl47fZukCTgp33n5H1dft53DoUGE9o198Pg9hXLvefe0hHhAd44k1WMV9Ykw2QRGN8jAj88NhALJnaHTqPCjtNZ+F/5GUmZ+aX4tbx35f8Gt632+Tu08MfILtZl5Z9trajO7C1v/u1bKcwAwJDyped/JGfI1Rtbv0xl9/S1NgLvTsnGxqOZyCwwQCVZj3W4v38UFt/bE389dxP2vnQzBrYLQaHBhIeX7Za/YecUGfHg57txIa8U7UL98MXDfdGtVQAKSk3416+HMfb9rdid4vqdlIUQyCyovSna3UxmCxb8fgT9Xt+A+z7Zic+3pSAtuwRCAG+sPdqg+/l8uvU0LhUYEBXsixs7NofBZMHUr/c6fP7W9pOX8fm2FPyelI59qTlIzytx2fiPpOdj5veJuPHtzfh+dyqM5WF3/u9HcPyi5xwfAgAXcktw98c78FtSOrzUEtqG+CK3uAx/+2ovCkqv3FS/6VgmBi/8E9cv2IjNCodPIQSeX5Ukb7Z5OD0fT608YHemnbsdzciXv9c9N6YT5t7SEbfGtkS75n4QAvh6x1nc/uFfdr/ckT1JOLOu1gPk5+cjMDAQeXl5CAioeRdcqtnzqw7i+91p6BSux/9mDsGY97biZGYh5o3vgocHR9td+0dyBh77JgEA8I/RHTFteHtIkgQAeH/jCSyKO44Weh02Pj0Mn25NwfsbT6BPm2b46YlBNX7uvWeycdfHO6BVq7Dt2RsR4OOF7vP+QJlZIP4fw9EmxE++Nv74JUz5YjckCRDCekL4lrk3VvuYZovAwrVHkFVeNereKhBdIgLgq9VUuza/tAz3/WcnDqfno02IL75+pD9mrdiPA2m5CA/wxk/TBqFVkA/MFoGVe9Lw1h9HkVO+UmrygCg8N6YT9N5ejt34Si7ml+LJlfux/VQW7u8fiVdv62Y3fWZTYjRj5+ksRAT5IKaFP1QqyenPXZvvd6fi+VVJAIBAHy+M6NwCN3ZsgZd/OYTc4jK8d19PTOjZ6oofp7TMjHM5JRBCICrEt9aNHW0yC0ox/K3NKDaasWRSL4zqEo6nftiPXw+mQyUBCyfG4p5+kfX+e3yz8yxeXn2o2uM+XmrMn9gNd/RqXe+PVVXc4Yt47Ju9sP18vLFjczwxvAM+3HQS8ccvoXPLAKyePuiKf2fAGqJVKgmBPs5/PV3JhsMXsSrxHJr769CqmQ9aBflCrQJeWp2My4UGBPtp8fEDfdAmxBe3LdmGi/kG3NSpBT59qC/UNXzNFRlMeP33I1i+q6KhXyVZV09OHdpO/h7hTp9vS8Frvx6GWiXhmVEd8W7ccRjNFsy8qQOeHtXxiq9PvpCHk5mFGNOtZY3/Fmtitgh8t+ssooJ9MbxjC7vnLBaBuz7ejn2puRjdNQz/ebCv3fObj2Xi6R8OIKvICB8vNf41oSvu6tNakXvnblfz85thhmqVXWTETe9sRm5xGcZ2D8fvSRnw1aqx84URCKjhB/XGIxcR5KuVG29tSsvMGL14C85mFePB69tg7aF0XC40YsmkXrg1NqLWz3/n0u1IOJuDacPbY3jHFrjnPzvQXK/D7hdG2P1DLjGa0ePV9fJvvg9cH4V/397d6b9/ZkEp7ly6HWnZJfBSSygzCwT5euHHxwYiJkxvd21usRFvrDuK73dbG4zDA7zx79u74ebyCpMjNh65iGd+PCCHJAAYEB2Mjx/o
g2blp6QDwI5TWXhu1UGcLW+Ibebrhf7RwRgQHYJhHZujfXN/h8dQVYnRjOFvb8LFfAOevPk6TL+xPTTlS/9tofW6MH+smz20WqBKyy7GuxuO41RmIc7nluByYcUKOJVk3eyxXXM/tG/uj3v7ReK6Kvf4hZ+TsHxXKnpEBmH1tEGQJAlmi8CLPydhxR7rfa8apGuz5sAFzF6RCCGAge1CUGa2ID2vFBfzS2GyCOg0KqyZMQQdw/V1fpyanLpUiAlL/kKhwYSbOrXA06OuQ9eIQADWr6lbFluPGZk6tB1eGNu5zo+1NikdT/94AEIAf78hGlOHtnNJSK7J8YsFuG3JNpSW1Txd2ilcj08f6ovIYF8AwMFzubj74x0wmCx4bGg7PF/l75JwNhtP/XBA/rp8ZHA0io0m+f/VhJ4ReOPOWHh7qXGpwICNRy5i/eGLSLlcBJ1GBW8vNby9VPDVajCpf5RT/5Zstp+8jAe/2A2zReCft3bBI0Oi8d+Ec3jmxwMAcMUgfuZyEca9vxVFRjPahfrh5Vu7yNPXdflmxxm8/EsygOpfo9/tOosXfz4EP60aG54ehpaBPtVen5lfijnlv9QAwJhu4Xh0aDv0igy64te6xSKw9lAGvtpxBgaTBQHeGgR4eyHAR4OwAG+M7xHh0u8RrsQwUwnDjHOW70rFCz8nye87GhQ2H8vEw8v2yO+3DPTGlrk31rkHzvrkDEz9JgF6bw0euL4Nlm4+hbHdw/HR5D7Vrr3/k53YUd6Q+MmDfTCqa/hVj7EmKZeLcNfS7cgqMsJXq8Z3fx+AXlHNar1++6nLeGFVEs6UfwO/rUcE5k/sDn9d9eqP7eNvOpqJ5nod2ob4oU2oL3QaFRauPYplf50BYN38b/KANpj/+xEUGkxoE+KLz6f0RXigD95YexTflK86C/bTosRoRkmluXVJAu7s3RpPj7quxm+SV2vp5lN4Y91RtArywZ/PDLOrLOSVlGHIG3+ioNSEpZN7Y0z3lvJzRQYTJtRQJvfTqiFJEgoN9j0LWo0K/7y1CyYPiIIkSTiZWYDRi7fCbBH44bGB6B9dMdUohMDCtUfxn/I9i+7tG4l/39Gt1q+tTUcz8ejXe2GyCDw0sA1eva2r/APBbBH421d7sPnYJVwX5o9fpg+Bj/bK1RObQoNJng7oHx2M7/4+oNo44g5fxKNf74UkAd/9bQAGdai+Q7fFIrB44wm8v/GE3ePBflrMvKkDJg9oU60qYLYI5JeUIbekDLnFRhSUmtCqmQ/ahvjVWDWprLTMjNuWbMPxi4Xo17YZ+rQJxoXcEpzPLUFGXimubxeCVyd0rfZ1/Mv+85i9Yj8Aa0XSaLLgXE4J0nKKcT7XOvUYEeiNt+/ugUEdQiGEwDc7z+Jf/zsMk0Wgc8sA+GnVSEjNQV0/iXy1aqydfYNdRfZqpWUX47Yl25BTXIaJvVvhnbt7yP/fF6w9gv/En4ZWo8LKqdfX+G/caLLgzqXbq50Fd1OnFnj51i6IDq15bDlFRgx/e7N8vAwA3N8/Eq9N6IbsYiNufice+aUmOVzVxmwR+Dj+FBbFHZenQjuF6zF5QBRu79WqWsgVQuDPo5l4Z/1xeUf02vSPDsb9/SMxpltLeHvV/+u9oTHMVMIw4xyzRWDiR3/hwDnrP+D1Tw6t9htzfT32zV78kXwRgLXPZtrwDnVeb7EI3PxuPE5fKoJWrYLRbKn1H/yHm07irT+OQaOSkPjPkS797fXwhXwsjT+FBwZEYUC7kCteX1pmxrsbjuPTLadhEUDnlgH44uG+1cLEhsMXMXtFIoqqNEV6e6nk344fGRyNZ8d0hE6jxvGLBfjbV3uQll0CvU6DAB8vefPCSQOi8PyYTvD2UiPpfB52ns7CjlNZ8pJ1by8VHr2hHR4b1r7WYFVQWobdKdlITM3FkJhQXF/l75pXXIYb3vwT+aUmvHN3D9zZp/o0zKL1x/D+nyfRuWUAfp81BJIkQQiBGd8n4reD6QgL
0OHV27qidTNftG7mI0+dXCo04PSlIpy+VIS1h9LlcY/pFo6Fd8bi6R8OYMORixjZJQyfPtS32ucFgK93nMG8NcmwCOCGmFB8NLl3ta+DPWey8eDnu1BaZsGEnhF4956e1SpIlwsNuGXxVlwuNGDygCi8fkf9wrsQAtO+24e1hzIQFqDDrzNvQPPy88Oqen5VEr7fnYrwAG+sm3MDgnwrKm2FBhOeXLkfcYet/1YeGRyNfm2b4a31x3D6krV/KyLQG6F6HQoNJhQbzCgymFBoNNUYCHy81OgYrkfnlgHoH90Mt/VoVS3cvPhzEr7blYpQfx3Wzq593DV5+49jWLLpZI3PTezVCq/c1rXaFNmOU1mYvnwfsivtT9WjdSBGdQ1H76hmMFksKC2zoKTMjK+3n8Heszno3zYYK6Ze79AUaqHBhLs/3oEj6fno3ioQPz4+0O6Httki8Ng3e7HhSCZC/bX4bEo/9IwMsvsY//71MD7bliJXZ39MOIdlf6WgzCzgpZbw3JjO+FsN35teWp2Eb3emolO4Hvf2i8Rrvx6GRQDDOzaHj5caaw9loHurQKyePviKoRMADp3Pwxd/peC3g+kwmKzfJ3y81IgO9UOoXofm/jqE6rXyv2UA8Ndp8Lch0egaYe3xKygtQ0GpCfvTcrHpWKY8HRrgrcHMm2Lw6NB2V3V/07KL4a/T2FWMXYFhphKGGeclncvDvZ/swA0xodXmc6/G+dwSjH53C1QSEP+PG+v1hb9idyqeW1VRGfp15hB0axVY7brTlwox9v2tGNE5DB9O6u3wGF1pX2oOpn6dgMuFBoQHeOOLh/uhS0QAhBD4ZMtpLFx3FEJYKy8+XmqcySqWl2EG+2nx9t2xuKmTfWk9u8iIx79JkHc8jgz2wcKJsRhcw2/3AJCYmoP5vx+Rd04O9deiX9tgBPp4IdDXC4E+XsgvMWHH6SwcOp8n/8bnpZbwwf29cEu3iuqK7bfXjmF6/D77hhq/8eYWGzF44Z8oMprx6UN9MbJLmNyjoFFJWPnY9ejTJrja6yqzWAS++CsFb6w7ijKzQAu9DpkFBqhVEv6YMxQdWtReEt945CJmLE9ESZkZncL1eHpURxSUliG7yGhdirvzLApKTbixY3N88lDfWqs3W09cwoOf7wYAfPxAb7v7UJuP409h4dqj8FJLWPnYQPSuo4JXbDRh3PvbkHK5CC0DvREV7ItQfx1C/LXYcSoLJzILodWoMP+O7rirPDSazBas3JuGxRtOVNsyoTK9ToNAXy/4atVIzS6uNm3UMzIIb9wZK0+h/Z6Ujmnf7YMkAd88MkBuqK8vi0Xg/T9P4HxOCSKDfREZ7IPWzXzRJsTXbqViVWnZxfh062m54b+2ymFadjFuWbwFRUYzXhrXGX+/4ep+0BpNFvztqz3YeuIyQvy0+N/MIYgIqv65Cg0m3PufHUi+kA+dRoW37u6B23pYp8H/PHoRj3y5FwDkr2vA+n3ntV8PY9Mx6yrKhRO7477+UfLHTL6Qh/EfbINFACumXo/r24VgfXIGZq1IlP+/qCTgl+lD0L119e9rdcktNmLVvvNYvju11sZgby8Vpgxqi8eHtq/1+21GXil+3JuGlXvTcC7H+svRlapEZWYL9pzJxqajmfjzaCZOXSrCK+O74P8G1/4aRzDMVMIw4xqFBhO8NSq5P8JR53Ks0y+tm/nW6/rSMjOGvLEJlwsN8NOqceCVUbWOIb+0DN4adb2b8twhLbsY//flHpzMLISfVo137+2J9Ycv4r8J1nOiJg+Iwrzbuso/VAsNJpzPKUHrZj7wq6WCYjRZsORP6/TDY8Pa13qdjRACfyRnYOHao/L0V23ahPgi2E+LxNRcqFUSFt3TAxN6tkJ6XgmGv7UZBpMFn0/pixGda+9feGPdUSzdfArdWwXi5Vu7YNKnO2GyiKv+ZncgLRczvt+HtGzrN9j6VkmSzuXhka/21PoDv1/bZvj6kQFXnD6yhbcAbw3Wzhkq
73hdk60nrE3oFgG8fkc3TC7frqAuB9Jycf+nO1Fcw3LlFnod/vNgnxqnO4qNJvx1MgsqyXoGm79OI/83yNfLLqCZLQIpl4twJD0fhy7kYfnOVBQYTPBSS5h+YwdM6NkKty3ZhoJSE54Y3h7P3tLpiuNWgm26W6tR4fdZN9QZaCuzWASe+mE/Vu+/AB8vNVZMvR49qlRcKisoLcPsFfvx51HriqtZI2Jwf/9IjHt/G7KLjHh4UFvMu61rtde9ue4oPtp8CioJ+GhyH9zSLRxCCNz7n53YfSYbt8a2xJJKv2TtT8vF377cg6w6PmZ9CSFwIrMQF3JLcKnAgEuFBlwqMECvs07PtwioPVBWZrEIfPDnSby7wbr1QU39Q5cKDHhz3VGsO5SBgkpTw2qVhL8Pia7WN+UshplKGGY8n20K6caOzbHs//orPZyrlldchse/TZB7egDrb2OvjO+Khwa2cduqBKPJgj+PZuJifinySsrkN7UkoV90MAa2D5FXaM3970H8tO8cJAl4485Y7DubgxV70tCvbTP88NjAOsecVWjAkDc2oaTMDF+tGsVGM27rEYH37ut51X/X/NIy/PvXw0jNLsaSSb0R6l+/6Y/zuSX45+pDyMgvRbCfFs18tQj206J1Mx/c1z+q1qm2yowmC+7+eDsOnMtDr6ggfDipd7Xf6KtWke7tG4mFd3av99/zcqEBxzMKkFVkRFahAVlFRkgAJl/fBmH1/CF0NTLySvHS6kPYcMQ6haVRSTBZBHpFBeGHxwbW6xw3JQghMGXZHmw5fgk9IoPw0+MDoVGrIITAvtQcrE++iBB/Le7s3VreUBOwLoP/ZMtpaFQSPpvSt9pKopqYLQJvrDsqnxsX4K1BfqkJXVoG4OdaVqAJIfDcT0lYuTcNWo0KXz/SHxfzSzF7xX54e6nw59PDq33tZOSVYldKFsZ2b9lo7rsQAq/+7zC+3H4GGpWEzx/uh2HXNYcQAj8nnse/fj2M3PIFCSF+Wgzr2Bw3dWqBG2KaN8hqO4aZShhmPJ/JbMGPCecwpEOovJLC0xhNFjy36iBW7TsPvbcGH07qXW1jv8bEYhF46ZdD8pJalQRYBPDTEwOvOE0EAK//dhifbrUeV3FdmD9+njb4ihWkxuhsVhHGvb/NWpn0UuGxoe3x+LD28NFaV+A88+MB+bT30V3D8N59vRpVA2VNhBD49WA65q1JRlaREXpvDX6fdUOj/7eVnleCUe9uQUGpCVOHtoOfVoNViefk1VIAoFWrcEu3cEweEIWD5/LwevkxK4vu6YGJva9uqf0Pe9Pw4s9JKDML+GrV+N/MIXWu+jGZLZj23T6sP3wRep0G3uVfI0+PvA4zR8Q49pdWgMUiMGflfqw5YK1mvXtvT6zckypPpXWNCMA/b+2Cfm2DG3QLCIBhxg7DDDUWQghsO3kZ7Zv71zhn39gIIfCvXw/Lq6rqar6tKrOgFCPejgcArJ4xuNEu/ayPwxfy8cqaQ3LfUctAb0weEIUvt5/B5UIjdBoVXq608spTZBcZsXzXWdwQ07zOqZfGZNW+c3jqhwN2j/lq1RjVJQwpl4vkhQqVPTemEx4f1t6hz7c7JRsfbjqJKYPaVOtfq0lpmRkPfb7brqct7slhjT7gVlW5z8hGq1Fh9ogYTB3azm2VJIaZShhmiBwnhMBHm09h87FMvH13j6taGnshtwRqldQg0yXuJoTA70kZmP/7EXkFGWBdGvv+/b0cXuFHV0eIiqrBkA6hmNi7FUZ3DZc3vkw6l4flu89ideIFlJSZ8cjgaLx8a2e3hsy8kjLc98lOHEnPx2cP9XXJ/jhKKDKYMOnTnThwLg992jTDG3fG1rtXyVUYZiphmCEiVyktM+PzbSn4dudZjO4ajufKl8OT+1gsAkazpc77nl9ahpRLRYhtHahItcy2u7W7f/i7WmmZGckX8tAzslm9lo27GsNMJQwzREREnudqfn43jhZqIiIiIgcxzBAREZFHY5gh
IiIij8YwQ0RERB6NYYaIiIg8GsMMEREReTSGGSIiIvJoDDNERETk0RhmiIiIyKMxzBAREZFHY5ghIiIij8YwQ0RERB6NYYaIiIg8GsMMEREReTSN0gNoaEIIANajxImIiMgz2H5u236O1+WaDzMFBQUAgMjISIVHQkRERFeroKAAgYGBdV4jifpEHg9msVhw4cIF6PV6SJLk0o+dn5+PyMhIpKWlISAgwKUfm+zxXrsP77X78F67D++1+7jqXgshUFBQgIiICKhUdXfFXPOVGZVKhdatWzfo5wgICOA/DjfhvXYf3mv34b12H95r93HFvb5SRcaGDcBERETk0RhmiIiIyKMxzDhBp9PhlVdegU6nU3oo1zzea/fhvXYf3mv34b12HyXu9TXfAExERETXNlZmiIiIyKMxzBAREZFHY5ghIiIij8YwQ0RERB6NYcZBH330EaKjo+Ht7Y0+ffpg69atSg/J4y1YsAD9+vWDXq9HixYtcPvtt+PYsWN21wghMG/ePERERMDHxwfDhw9HcnKyQiO+dixYsACSJGHOnDnyY7zXrnP+/Hk88MADCAkJga+vL3r27ImEhAT5ed5r1zCZTHjppZcQHR0NHx8ftGvXDv/6179gsVjka3ivHbNlyxaMHz8eERERkCQJq1evtnu+PvfVYDBg5syZCA0NhZ+fH2677TacO3fONQMUdNVWrFghvLy8xKeffioOHz4sZs+eLfz8/MTZs2eVHppHGz16tFi2bJk4dOiQ2L9/vxg3bpyIiooShYWF8jULFy4Uer1e/PTTTyIpKUnce++9omXLliI/P1/BkXu23bt3i7Zt24rY2Fgxe/Zs+XHea9fIzs4Wbdq0EQ8//LDYtWuXSElJERs2bBAnT56Ur+G9do1///vfIiQkRPz6668iJSVF/Pjjj8Lf318sXrxYvob32jG///67ePHFF8VPP/0kAIiff/7Z7vn63NfHH39ctGrVSsTFxYl9+/aJG2+8UfTo0UOYTCanx8cw44D+/fuLxx9/3O6xTp06ieeee06hEV2bMjMzBQARHx8vhBDCYrGI8PBwsXDhQvma0tJSERgYKD7++GOlhunRCgoKRExMjIiLixPDhg2Twwzvtes8++yzYsiQIbU+z3vtOuPGjROPPPKI3WMTJ04UDzzwgBCC99pVqoaZ+tzX3Nxc4eXlJVasWCFfc/78eaFSqcS6deucHhOnma6S0WhEQkICRo0aZff4qFGjsH37doVGdW3Ky8sDAAQHBwMAUlJSkJGRYXfvdTodhg0bxnvvoOnTp2PcuHG4+eab7R7nvXadNWvWoG/fvrj77rvRokUL9OrVC59++qn8PO+16wwZMgQbN27E8ePHAQAHDhzAtm3bMHbsWAC81w2lPvc1ISEBZWVldtdERESgW7duLrn31/xBk652+fJlmM1mhIWF2T0eFhaGjIwMhUZ17RFC4KmnnsKQIUPQrVs3AJDvb033/uzZs24fo6dbsWIF9u3bhz179lR7jvfadU6fPo2lS5fiqaeewgsvvIDdu3dj1qxZ0Ol0eOihh3ivXejZZ59FXl4eOnXqBLVaDbPZjNdffx33338/AH5dN5T63NeMjAxotVo0a9as2jWu+NnJMOMgSZLs3hdCVHuMHDdjxgwcPHgQ27Ztq/Yc773z0tLSMHv2bKxfvx7e3t61Xsd77TyLxYK+ffti/vz5AIBevXohOTkZS5cuxUMPPSRfx3vtvJUrV+Lbb7/F8uXL0bVrV+zfvx9z5sxBREQEpkyZIl/He90wHLmvrrr3nGa6SqGhoVCr1dWSZGZmZrVUSo6ZOXMm1qxZg02bNqF169by4+Hh4QDAe+8CCQkJyMzMRJ8+faDRaKDRaBAfH4/3338fGo1Gvp+8185r2bIlunTpYvdY586dkZqaCoBf1670j3/8A8899xzuu+8+dO/eHQ8++CCefPJJLFiwAADvdUOpz30NDw+H0WhETk5Ordc4g2HmKmm1WvTp0wdx
cXF2j8fFxWHQoEEKjeraIITAjBkzsGrVKvz555+Ijo62ez46Ohrh4eF2995oNCI+Pp73/iqNGDECSUlJ2L9/v/zWt29fTJ48Gfv370e7du14r11k8ODB1bYYOH78ONq0aQOAX9euVFxcDJXK/seaWq2Wl2bzXjeM+tzXPn36wMvLy+6a9PR0HDp0yDX33ukW4ibItjT7888/F4cPHxZz5swRfn5+4syZM0oPzaM98cQTIjAwUGzevFmkp6fLb8XFxfI1CxcuFIGBgWLVqlUiKSlJ3H///VxW6SKVVzMJwXvtKrt37xYajUa8/vrr4sSJE+K7774Tvr6+4ttvv5Wv4b12jSlTpohWrVrJS7NXrVolQkNDxdy5c+VreK8dU1BQIBITE0ViYqIAIBYtWiQSExPlLUnqc18ff/xx0bp1a7Fhwwaxb98+cdNNN3FpttI+/PBD0aZNG6HVakXv3r3l5cPkOAA1vi1btky+xmKxiFdeeUWEh4cLnU4nhg4dKpKSkpQb9DWkapjhvXad//3vf6Jbt25Cp9OJTp06iU8++cTued5r18jPzxezZ88WUVFRwtvbW7Rr1068+OKLwmAwyNfwXjtm06ZNNX5/njJlihCifve1pKREzJgxQwQHBwsfHx9x6623itTUVJeMTxJCCOfrO0RERETKYM8MEREReTSGGSIiIvJoDDNERETk0RhmiIiIyKMxzBAREZFHY5ghIiIij8YwQ0RERB6NYYaIiIg8GsMMEblM27ZtsXjx4npfv3nzZkiShNzc3AYbU2NytfeHiOpHo/QAiEg5w4cPR8+ePV32A3bPnj3w8/Or9/WDBg1Ceno6AgMDXfL5iahpYpghojoJIWA2m6HRXPnbRfPmza/qY2u1WoSHhzs6NCIiAJxmImqyHn74YcTHx+O9996DJEmQJAlnzpyRp37++OMP9O3bFzqdDlu3bsWpU6cwYcIEhIWFwd/fH/369cOGDRvsPmbVaRRJkvDZZ5/hjjvugK+vL2JiYrBmzRr5+arTTF9++SWCgoLwxx9/oHPnzvD398ctt9yC9PR0+TUmkwmzZs1CUFAQQkJC8Oyzz2LKlCm4/fbb6/z7bt++HUOHDoWPjw8iIyMxa9YsFBUV2Y39tddew6RJk+Dv74+IiAh88MEHdh8jNTUVEyZMgL+/PwICAnDPPffg4sWLdtesWbMGffv2hbe3N0JDQzFx4kS754uLi/HII49Ar9cjKioKn3zySZ3jJqIrY5ghaqLee+89DBw4EI8++ijS09ORnp6OyMhI+fm5c+diwYIFOHLkCGJjY1FYWIixY8diw4YNSExMxOjRozF+/HikpqbW+XleffVV3HPPPTh48CDGjh2LyZMnIzs7u9bri4uL8fbbb+Obb77Bli1bkJqaimeeeUZ+/o033sB3332HZcuW4a+//kJ+fj5Wr15d5xiSkpIwevRoTJw4EQcPHsTKlSuxbds2zJgxw+66t956C7Gxsdi3bx+ef/55PPnkk4iLiwNgrVDdfvvtyM7ORnx8POLi4nDq1Cnce++98ut/++03TJw4EePGjUNiYiI2btyIvn372n2Od955B3379kViYiKmTZuGJ554AkePHq1z/ER0BS45e5uIPNKwYcPE7Nmz7R7btGmTACBWr159xdd36dJFfPDBB/L7bdq0Ee+++678PgDx0ksvye8XFhYKSZLE2rVr7T5XTk6OEEKIZcuWCQDi5MmT8ms+/PBDERYWJr8fFhYm3nrrLfl9k8kkoqKixIQJE2od54MPPiimTp1q99jWrVuFSqUSJSUl8thvueUWu2vuvfdeMWbMGCGEEOvXrxdqtVqkpqbKzycnJwsAYvfu3UIIIQYOHCgmT55c6zjatGkjHnjgAfl9i8UiWrRoIZYuXVrra4joyliZIaIaVa0oFBUVYe7cuejSpQuCgoLg7++Po0ePXrEyExsbK//Zz88Per0emZmZtV7v6+uL9u3by++3bNlSvj4vLw8XL15E
//795efVajX69OlT5xgSEhLw5Zdfwt/fX34bPXo0LBYLUlJS5OsGDhxo97qBAwfiyJEjAIAjR44gMjLSrnpluxe2a/bv348RI0bUOZbK90OSJISHh9d5P4joytgATEQ1qroq6R//+Af++OMPvP322+jQoQN8fHxw1113wWg01vlxvLy87N6XJAkWi+WqrhdCVHussqrPV2WxWPDYY49h1qxZ1Z6Lioqq87W2zyWEqPZ5qz7u4+NT58cCrv5+ENGVsTJD1IRptVqYzeZ6Xbt161Y8/PDDuOOOO9C9e3eEh4fjzJkzDTvAKgIDAxEWFobdu3fLj5nNZiQmJtb5ut69eyM5ORkdOnSo9qbVauXrdu7cafe6nTt3olOnTgCsVZjU1FSkpaXJzx8+fBh5eXno3LkzAGvVZePGjU7/PYno6rAyQ9SEtW3bFrt27cKZM2fg7++P4ODgWq/t0KEDVq1ahfHjx0OSJLz88suKVBRmzpyJBQsWoEOHDujUqRM++OAD5OTk1Fg1sXn22Wdx/fXXY/r06Xj00Ufh5+eHI0eOIC4uzm7F0l9//YU333wTt99+O+Li4vDjjz/it99+AwDcfPPNiI2NxeTJk7F48WKYTCZMmzYNw4YNk6fkXnnlFYwYMQLt27fHfffdB5PJhLVr12Lu3LkNe1OImjhWZoiasGeeeQZqtRpdunRB8+bN6+x/effdd9GsWTMMGjQI48ePx+jRo9G7d283jtbq2Wefxf3334+HHnoIAwcOlPtfvL29a31NbGws4uPjceLECdxwww3o1asXXn75ZbRs2dLuuqeffhoJCQno1asXXnvtNbzzzjsYPXo0AOt00OrVq9GsWTMMHToUN998M9q1a4eVK1fKrx8+fDh+/PFHrFmzBj179sRNN92EXbt2NcyNICKZJK402UxE1IhZLBZ07twZ99xzD1577TWHP07btm0xZ84czJkzx3WDIyK34DQTEXmUs2fPYv369Rg2bBgMBgOWLFmClJQUTJo0SemhEZFCOM1ERB5FpVLhyy+/RL9+/TB48GAkJSVhw4YNchMuETU9nGYiIiIij8bKDBEREXk0hhkiIiLyaAwzRERE5NEYZoiIiMijMcwQERGRR2OYISIiIo/GMENEREQejWGGiIiIPNr/AykvyBMolM/vAAAAAElFTkSuQmCC",
      "text/plain": [
       "<Figure size 640x480 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# 为对比学习负采样准备词频率分布\n",
    "vocab_size = len(dataset.token2id)\n",
    "embed_size = 128\n",
    "distribution = dataset.get_word_distribution()\n",
    "print(distribution)\n",
    "model = SkipGramNCE(vocab_size, embed_size, distribution)\n",
    "\n",
    "from torch.utils.data import DataLoader\n",
    "from torch.optim import SGD, Adam\n",
    "\n",
    "# 定义静态方法collate_batch批量处理数据，转化为PyTorch可以需要的张量类型\n",
    "class DataCollator:\n",
    "    @classmethod\n",
    "    def collate_batch(cls, batch):\n",
    "        batch = np.array(batch)\n",
    "        input_ids = torch.tensor(batch[:, 0], dtype=torch.long)\n",
    "        labels = torch.tensor(batch[:, 1], dtype=torch.long)\n",
    "        return {'input_ids': input_ids, 'labels': labels}\n",
    "\n",
    "# 定义训练参数以及训练循环\n",
    "epochs = 100\n",
    "batch_size = 128\n",
    "learning_rate = 1e-3\n",
    "epoch_loss = []\n",
    "\n",
    "data_collator = DataCollator()\n",
    "dataloader = DataLoader(data, batch_size=batch_size, shuffle=True,\\\n",
    "    collate_fn=data_collator.collate_batch)\n",
    "optimizer = Adam(model.parameters(), lr=learning_rate)\n",
    "model.zero_grad()\n",
    "model.train()\n",
    "\n",
    "# 需要提前安装tqdm\n",
    "from tqdm import trange\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "# 训练过程，每步读取数据，送入模型计算损失，并使用PyTorch进行优化\n",
    "with trange(epochs, desc='epoch', ncols=60) as pbar:\n",
    "    for epoch in pbar:\n",
    "        for step, batch in enumerate(dataloader):\n",
    "            loss = model(**batch)\n",
    "            pbar.set_description(f'epoch-{epoch}, loss={loss.item():.4f}')\n",
    "            loss.backward()\n",
    "            optimizer.step()\n",
    "            model.zero_grad()\n",
    "        epoch_loss.append(loss.item())\n",
    "    \n",
    "epoch_loss = np.array(epoch_loss)\n",
    "plt.plot(range(len(epoch_loss)), epoch_loss)\n",
    "plt.xlabel('training epoch')\n",
    "plt.ylabel('loss')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c9430e9a",
   "metadata": {},
   "source": [
    "TF-IDF加权\n",
    "\n",
    "定义词频率（term frequency）。注意到不同长度的文章词频率会有较大差距，不利于比较和运算，因此可以对词频率取对数。\n",
    "\n",
    "$$\\text{tf}_{t,d} = \\log (\\text{count}(t,d) + 1)$$\n",
    "\n",
    "其中$\\text{count}(t,d)$表示词$t$在文档$d$中出现的次数，为了避免对0取对数，把所有的计数加1。\n",
    "\n",
    "那么如何区分高频词与低频词呢？TF-IDF引入了另一个重要的评价指标——文档频率（document frequency），即一个词在语料库所包含的多少篇文档中出现。在所有文档里出现的词往往是虚词或是常见实词，而只在少量文档里出现的词往往是具有明确含义的实词并且具有很强的文档区分度。用$\\text{df}_t$来表示在多少篇文档中出现了词$t$。\n",
    "\n",
    "为了压低高频词和提升低频词的影响，TF-IDF使用文档频率的倒数，也就是逆向文档频率（inverse document frequency）来对词频率进行加权。这很好理解，一个词的文档频率越高，其倒数就越小，权重就越小。\n",
    "\n",
    "$$\\text{idf}_t = \\log \\frac{N}{\\text{df}_t}$$\n",
    "\n",
    "其中$N$表示文档总数。为了避免分母为0，通常会将分母改为$\\text{df}_t+1$。\n",
    "\n",
    "基于词频率和逆向文档频率，得到TF-IDF的最终值为：\n",
    "\n",
    "$$w_{t,d} = \\text{tf}_{t,d} \\times \\text{idf}_{t}$$\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f765e353",
   "metadata": {},
   "source": [
    "很多情况下会额外对文档的TF-IDF向量使用L2归一化，使得不同文档的TF-IDF向量具有相同的模长，便于相互比较。\n",
    "下面给出了TF-IDF的代码实现。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "9ce8e610",
   "metadata": {},
   "outputs": [],
   "source": [
    "class TFIDF:\n",
    "    def __init__(self, vocab_size, norm='l2', smooth_idf=True,\\\n",
    "                 sublinear_tf=True):\n",
    "        self.vocab_size = vocab_size\n",
    "        self.norm = norm\n",
    "        self.smooth_idf = smooth_idf\n",
    "        self.sublinear_tf = sublinear_tf\n",
    "    \n",
    "    def fit(self, X):\n",
    "        doc_freq = np.zeros(self.vocab_size, dtype=np.float64)\n",
    "        for data in X:\n",
    "            for token_id in set(data):\n",
    "                doc_freq[token_id] += 1\n",
    "        doc_freq += int(self.smooth_idf)\n",
    "        n_samples = len(X) + int(self.smooth_idf)\n",
    "        self.idf = np.log(n_samples / doc_freq) + 1\n",
    "    \n",
    "    def transform(self, X):\n",
    "        assert hasattr(self, 'idf')\n",
    "        term_freq = np.zeros((len(X), self.vocab_size), dtype=np.float64)\n",
    "        for i, data in enumerate(X):\n",
    "            for token in data:\n",
    "                term_freq[i, token] += 1\n",
    "        if self.sublinear_tf:\n",
    "            term_freq = np.log(term_freq + 1)\n",
    "        Y = term_freq * self.idf\n",
    "        if self.norm:\n",
    "            row_norm = (Y**2).sum(axis=1)\n",
    "            row_norm[row_norm == 0] = 1\n",
    "            Y /= np.sqrt(row_norm)[:, None]\n",
    "        return Y\n",
    "    \n",
    "    def fit_transform(self, X):\n",
    "        self.fit(X)\n",
    "        return self.transform(X)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.17"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
